import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pymc3 as pm
import theano.tensor as tt
from theano.compile.ops import as_op
from scipy.stats import norm
from matplotlib import gridspec
from matplotlib.patches import Rectangle
from IPython.display import Image
import seaborn as sns
import pymc3 as pm
import numpy as np
from scipy.stats import norm
import pandas as pd
import theano.tensor as T
from theano.compile.ops import as_op
%matplotlib notebook
plt.style.use('seaborn-white')
color = '#87ceeb'
f_dict = {'size':14}
df2 = pd.read_csv('../data/pseudonymized-data.csv', dtype={'Community':'category',
'V27':'category',
'V28':'category',
'V29':'category',
'V30':'category',
'V31':'category',
'V32':'category',
'V33':'category'})
df2.info()
# reduce data values by 1 so that they become indexes for the cdf array below.
# the .cat.codes function from pandas doesn't work because it assings codes depending on the different values of the data
def from_data_to_code(x):
return int(x)-1
def codes(self):
return np.array([from_data_to_code(x) for x in self])
# sns.countplot(x=df2[df2.Community=='Comm.Linux_Kernel'].Community,hue=df2[df2.Community=='Comm.Linux_Kernel'].V27)
# codes(df2[df2.V28!='-1'].V28)
# df2.Community.cat.codes.values
# df2[df2.Community=='Comm.Linux_Kernel'].Community
df2.Community.cat.codes.values.shape
def infer(q):
# Number of outcomes
nYlevels2 = 5 # df2.V28.cat.categories.size
# Number of groups
n_grps = df2.Community.nunique()
# Group index
grp_idx = df2[df2[q]!='-1'].Community.cat.codes.values
thresh2 = np.arange(1.5, nYlevels2, dtype=np.float64)
thresh_obs2 = np.ma.asarray(thresh2)
thresh_obs2[1:-1] = np.ma.masked
print('thresh2:\t{}'.format(thresh2))
print('thresh_obs2:\t{}'.format(thresh_obs2))
@as_op(itypes=[tt.dvector, tt.dvector, tt.dvector], otypes=[tt.dmatrix])
def outcome_probabilities(theta, mu, sigma):
out = np.empty((nYlevels2, n_grps), dtype=np.float64)
n = norm(loc=mu, scale=sigma)
out[0,:] = n.cdf(theta[0])
out[1,:] = np.max([np.repeat(0,n_grps), n.cdf(theta[1]) - n.cdf(theta[0])], axis=0)
out[2,:] = np.max([np.repeat(0,n_grps), n.cdf(theta[2]) - n.cdf(theta[1])], axis=0)
out[3,:] = np.max([np.repeat(0,n_grps), n.cdf(theta[3]) - n.cdf(theta[2])], axis=0)
out[4,:] = 1 - n.cdf(theta[3])
return out
with pm.Model() as ordinal_model_multi_groups:
theta = pm.Normal('theta', mu=thresh2, tau=np.repeat(.5**2, len(thresh2)),
shape=len(thresh2), observed=thresh_obs2)
mu = pm.Normal('mu', mu=nYlevels2/2.0, tau=1.0/(nYlevels2**2), shape=n_grps)
sigma = pm.Uniform('sigma', nYlevels2/1000.0, nYlevels2*10.0, shape=n_grps)
pr = outcome_probabilities(theta, mu, sigma)
y = pm.Categorical('y', pr[:,grp_idx].T, observed=codes(df2[df2[q]!='-1'][q])) #df2.V28.cat.codes.to_numpy())
trace2 = pm.sample(5000, tune=1000, cores=4)
return trace2
t=infer('V33')
with ordinal_model_multi_groups:
trace2 = pm.sample(5000, tune=1000, cores=4)
## Mean density plots of answers to variable v for all communities
def mean_densities(t,v):
plt.figure(figsize=(15,5))
NUM_COLORS = 16
cmap = plt.get_cmap('tab20')
colors = [cmap(i) for i in np.linspace(0, 1, NUM_COLORS)]
grp_idxs_colors_grp_names = zip(df2.Community.cat.codes.unique(),colors,df2.Community.unique())
[pm.plot_kde(t['mu'][:,i],
plot_kwargs={'color': c}, cumulative=False,
label=n)
for (i,c,n) in grp_idxs_colors_grp_names]
plt.xlim([0,7])
plt.legend(loc='best')
plt.suptitle('Mean density plots for '+v)
plt.show()
# mean_densities(t,'V33')
pm.plot_kde(t['mu'][:,4])
plt.xlim([0,7])
plt.suptitle('DuckDuckGo question V33')
pm.summary(t)
plt.figure(figsize=(30,10))
sns.countplot(x=df2[df2.V33!='-1'].Community,hue=df2[df2.V33!='-1'].V33)
mu2 = trace2['mu']
sigma2 = trace2['sigma']
# Concatenate the fixed thresholds into the estimated thresholds
n = trace2['theta_missing'].shape[0]
thresholds2 = np.c_[np.tile([1.5], (n,1)),
trace2['theta_missing'],
np.tile([4.5], (n,1))]
fig, axes = plt.subplots(5,2, figsize=(10,14))
ax1,ax2,ax3,ax4,ax5,ax6,ax7,ax8,ax9,ax10 = axes.flatten()
# Mu
pm.plot_posterior(mu2[:,0], point_estimate='mode', color=color, ax=ax1)
ax1.set_xlabel('$\mu_{1}$', fontdict=f_dict)
pm.plot_posterior(mu2[:,1], point_estimate='mode', color=color, ax=ax3)
ax3.set_xlabel('$\mu_{2}$', fontdict=f_dict)
for title, ax in zip(['Comm.Coala Mean', 'Comm.Linux_Kernel Mean'], [ax1, ax3]):
ax.set_title(title, fontdict=f_dict)
# Sigma
pm.plot_posterior(sigma2[:,0], point_estimate='mode', color=color, ax=ax5)
ax5.set_xlabel('$\sigma_{1}$', fontdict=f_dict)
pm.plot_posterior(sigma2[:,1], point_estimate='mode', color=color, ax=ax7)
ax7.set_xlabel('$\sigma_{2}$', fontdict=f_dict)
for title, ax in zip(['Comm.Coala Std. Dev.', 'Comm.Linux_Kernel Std. Dev.'], [ax5, ax7]):
ax.set_title(title, fontdict=f_dict)
# Posterior distribution on the thresholds
ax9.scatter(thresholds2, np.tile(thresholds2.mean(axis=1).reshape(-1,1), (1,4)), color=color, alpha=.6, facecolor='none')
ax9.set_ylabel('Mean Threshold', fontdict=f_dict)
ax9.set_xlabel('Threshold', fontdict=f_dict)
ax9.vlines(x = thresholds2.mean(axis=0),
ymin=thresholds2.mean(axis=1).min(),
ymax=thresholds2.mean(axis=1).max(), linestyles='dotted', colors=color)
# Posterior predictive probabilities of the outcomes
threshCumProb2A = np.empty(thresholds2.shape)
for i in np.arange(threshCumProb2A.shape[0]):
threshCumProb2A[i] = norm().cdf((thresholds2[i] - mu2[i,0])/sigma2[i,0])
outProb2A = (np.c_[threshCumProb2A, np.tile(1, (thresholds2.shape[0],1))]
- np.c_[np.tile(0, (thresholds2.shape[0],1)), threshCumProb2A])
yerr2A = np.abs(np.subtract(pm.hpd(outProb2A), outProb2A.mean(axis=0).reshape(-1,1)))
ax2.errorbar(x = np.arange(outProb2A.shape[1]), y=outProb2A.mean(axis=0),
yerr=yerr2A.T, color=color, fmt='o')
threshCumProb2B = np.empty(thresholds2.shape)
for i in np.arange(threshCumProb2B.shape[0]):
threshCumProb2B[i] = norm().cdf((thresholds2[i] - mu2[i,1])/sigma2[i,1])
outProb2B = (np.c_[threshCumProb2B, np.tile(1, (thresholds2.shape[0],1))]
- np.c_[np.tile(0, (thresholds2.shape[0],1)), threshCumProb2B])
yerr2B = np.abs(np.subtract(pm.hpd(outProb2B), outProb2B.mean(axis=0).reshape(-1,1)))
ax4.errorbar(x = np.arange(outProb2B.shape[1]), y=outProb2B.mean(axis=0),
yerr=yerr2B.T, color=color, fmt='o')
for grp, ax in zip(['Comm.Coala', 'Comm.Linux_Kernel'], [ax2, ax4]):
((df2[df2.Community == grp].V28.value_counts().sort_index()/df2[df2.Community == grp].V28.size)
.plot.bar(ax=ax, rot=0, color='royalblue'))
ax.set_title('Data for {0} with Post. Pred.\nN = {1}'.format(grp, df2[df2.Community == grp].V28.size), fontdict=f_dict)
ax.set_xlabel('y')
sns.despine(ax=ax, left=True)
ax.yaxis.set_visible(False)
# Mu diff
pm.plot_posterior(mu2[:,1]-mu2[:,0], point_estimate='mode', color=color, ax=ax6)
ax6.set_xlabel('$\mu_{2}-\mu_{1}$', fontdict=f_dict)
# Sigma diff
pm.plot_posterior(sigma2[:,1]-sigma2[:,0], point_estimate='mode', color=color, ax=ax8)
ax8.set_xlabel('$\sigma_{2}-\sigma_{1}$', fontdict=f_dict)
# Effect size
pm.plot_posterior((mu2[:,1]-mu2[:,0]) / np.sqrt((sigma2[:,0]**2+sigma2[:,1]**2)/2), point_estimate='mode', color=color, ax=ax10)
ax10.set_xlabel(r'$\frac{(\mu_2-\mu_1)}{\sqrt{(\sigma_1^2+\sigma_2^2)/2}}$', fontdict=f_dict)
for title, ax in zip(['Differences of Means', 'Difference of Std. Dev\'s', 'Effect Size'], [ax6, ax8, ax10]):
ax.set_title(title, fontdict=f_dict)
fig.suptitle('V28: I do not consider a pull request/patch, unless I trust the contributor.', fontsize=16)
fig.tight_layout()
def count_per_data_value(d,c,q):
return np.array([ len(d[(d[q] == x) & (d.Community==c)][q]) for x in ['1','2','3','4','5']])
def comm_to_code(d,c):
return [code for (comm,code) in zip(d.Community.unique(),d.Community.cat.codes.unique()) if comm==c][0]
comm_to_code(df2,'-1')
def plot_ppc_normal_density(d,t,c,q):
c_code=comm_to_code(d,c)
x=np.arange(-5,10,step=.01)
s=pd.Series((count_per_data_value(df2,c,q)/count_per_data_value(df2,c,q).sum()))
# s.index=s.index+1
s.plot.bar(rot=0, color='royalblue')
plt.suptitle('Data of '+c+' question '+q)
[plt.plot(x,norm(loc=m,scale=s).pdf(x),color='lightblue',alpha=.05)
for (m,s) in zip(t['mu'][18000:,c_code],t['sigma'][18000:, c_code])]
plt.plot(x,norm(loc=np.mean(t['mu'][:,c_code]),scale=np.mean(t['sigma'][:,c_code])).pdf(x),color='blue',linewidth=2)
plt.xlim([-1,7])
plt.show()
[plot_ppc_normal_density(df2,t,c,'V33') for c in df2.Community.unique()]
pm.summary(trace2)
# plt.subplot(211)
x_axis = np.arange(-10, 15, 0.01)
(((df2[df2.Community == 'Comm.Linux_Kernel'].V28.value_counts().sort_index()*.4)/df2[df2.Community == 'Comm.Linux_Kernel'].V28.size)
.plot.bar(rot=0, color='purple'))
[plt.plot(x_axis, norm.pdf(x_axis,m,s),color='royalblue',alpha=.05) for (m,s) in zip(trace2['mu'][18000:,1],trace2['sigma'][18500:,1])]
plt.plot(x_axis, norm.pdf(x_axis,np.mean(trace2['mu'][:,1]),np.mean(trace2['sigma'][:,1])),color='black',linewidth=2)
plt.show()
# plt.subplot(212)
(((df2[df2.Community == 'Comm.Coala'].V28.value_counts().sort_index()*.4)/df2[df2.Community == 'Comm.Coala'].V28.size)
.plot.bar(rot=0, color='purple'))
[plt.plot(x_axis, norm.pdf(x_axis,m,s),color='royalblue',alpha=.05) for (m,s) in zip(trace2['mu'][18000:,0],trace2['sigma'][18500:,0])]
plt.plot(x_axis, norm.pdf(x_axis,np.mean(trace2['mu'][:,0]),np.mean(trace2['sigma'][:,0])),color='black',linewidth=2)
plt.show()